# =============================================================================
# IV-2SLS CROSS-SECTIONAL ROBUSTNESS (Keep Productivity, No Sector FE)
# =============================================================================

import pandas as pd
import statsmodels.api as sm
from linearmodels.iv import IV2SLS
from datetime import datetime

print("=== IV-2SLS CROSS-SECTIONAL (Keep Productivity, No Sector FE) ===")

df = pd.read_csv("dataset.csv")
print(f"Loaded dataset: {len(df):,} observations")

df['prov_sec'] = df['province_id'].astype(str) + "_" + df['sector_id'].astype(str)

numeric_cols = ['ln_wage', 'spe', 'div', 'den', 'ln_fdi', 'ln_productivity_lag', 'skilled_share',
                'mean_tri', 'mean_slope', 'std_slope']

df_avg = df.groupby(['prov_sec', 'province_id', 'sector_id'])[numeric_cols].mean().reset_index()

print(f"Averaged cross-section: {len(df_avg):,} observations")

exog_vars = ['spe', 'div', 'ln_fdi', 'ln_productivity_lag', 'skilled_share']
exog = sm.add_constant(df_avg[exog_vars])

endog = df_avg[['den']]
instruments = df_avg[['mean_tri', 'mean_slope', 'std_slope']]

iv_model = IV2SLS(dependent=df_avg['ln_wage'],
                  exog=exog,
                  endog=endog,
                  instruments=instruments)

iv_results = iv_model.fit(cov_type='robust')

print("\n" + "="*100)
print("IV-2SLS CROSS-SECTIONAL ROBUSTNESS (Keep Productivity, No Sector FE)")
print("="*100)
print(iv_results.summary)

# ====================== FIRST-STAGE RESULTS ======================
print("\n" + "="*80)
print("FIRST-STAGE REGRESSION (den on terrain instruments + controls)")
print("="*80)
print(iv_results.first_stage.summary)

# Safe extraction of Partial F-statistic
fs_diag = iv_results.first_stage.diagnostics
f_stat = fs_diag.loc['den', 'f.stat']
f_pval = fs_diag.loc['den', 'f.pval']

print("\n" + "-"*70)
print("INSTRUMENT STRENGTH TEST")
print(f"Partial F-statistic on excluded instruments : {f_stat:.2f}")
print(f"P-value                                      : {f_pval:.4f}")

if f_stat > 100:
    print("→ Instruments are **EXCELLENT / VERY STRONG** (highly credible)")
elif f_stat > 20:
    print("→ Instruments are strong")
else:
    print("→ Instruments are weak")
print("-"*70)

# Main results table
main_vars = ['den', 'spe', 'div', 'ln_fdi', 'ln_productivity_lag', 'skilled_share']
iv_table = pd.DataFrame({
    'Coefficient': iv_results.params.loc[main_vars].round(4),
    'Std. Error': iv_results.std_errors.loc[main_vars].round(4),
    't-stat': iv_results.tstats.loc[main_vars].round(3),
    'P>|t|': iv_results.pvalues.loc[main_vars].round(4)
})

print("\nMAIN SECOND-STAGE RESULTS TABLE:")
print(iv_table)

# Save
with open("iv_cross_sectional_results.txt", "w", encoding="utf-8") as f:
    f.write("="*100 + "\n")
    f.write("IV-2SLS CROSS-SECTIONAL ROBUSTNESS (Keep Productivity, No Sector FE)\n")
    f.write(f"Date: {datetime.now().strftime('%B %d, %Y at %H:%M')}\n")
    f.write(f"Observations: {len(df_avg):,}\n")
    f.write(f"First-stage Partial F-statistic: {f_stat:.2f}\n")
    f.write(f"P-value (Partial F-stat)       : {f_pval:.4f}\n\n")
    f.write(iv_table.to_string(float_format="{:0.4f}".format))

print("\n✅ Results saved to: iv_cross_sectional_results.txt")